Enter the directory of the maca folder on your drive and the name of the tissue you want to analyze.
tissue_of_interest = "Aorta"
Load the requisite packages and some additional helper functions.
library(here)
here() starts at /Users/olgabot/code/tabula-muris
library(useful)
Loading required package: ggplot2
library(Seurat)
Loading required package: cowplot
Attaching package: 'cowplot'
The following object is masked from 'package:ggplot2':
ggsave
Loading required package: Matrix
Warning: namespace 'Biobase' is not available and has been replaced
by .GlobalEnv when processing object 'call.'
Warning: namespace 'lme4' is not available and has been replaced
by .GlobalEnv when processing object 'call.'
Warning: namespace 'MatrixModels' is not available and has been replaced
by .GlobalEnv when processing object 'call.'
Warning: namespace 'Biobase' is not available and has been replaced
by .GlobalEnv when processing object 'call.'
Warning: namespace 'lme4' is not available and has been replaced
by .GlobalEnv when processing object 'call.'
Warning: namespace 'MatrixModels' is not available and has been replaced
by .GlobalEnv when processing object 'call.'
library(dplyr)
Warning: package 'dplyr' was built under R version 3.4.2
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(Matrix)
save_dir = here('00_data_ingest', 'tissue_robj')
Load the plate metadata. Check which plates have been downloaded.
plate_metadata_filename = here('00_data_ingest', '00_facs_raw_data', 'metadata_FACS.csv')
plate_metadata <- read.csv(plate_metadata_filename, sep=",", header = TRUE)
colnames(plate_metadata)[1] <- "plate.barcode"
plate_metadata
“Aorta” is a subtissue of “Heart” and we will have to filter for this.
tissue_plates = filter(plate_metadata, subtissue == tissue_of_interest)[,c('plate.barcode','tissue','subtissue','mouse.sex')]
tissue_plates
Load the read count data.
#Load the gene names and set the metadata columns by opening the first file
filename = here('00_data_ingest', '00_facs_raw_data', 'FACS', 'Heart-counts.csv')
raw.data = read.csv(filename, sep=",", row.names=1)
# raw.data = data.frame(row.names = rownames(raw.data))
corner(raw.data)
Get the plate barcode of each individual cell
plate.barcodes = lapply(colnames(raw.data), function(x) strsplit(strsplit(x, "_")[[1]][1], '.', fixed=TRUE)[[1]][2])
meta.data = plate_metadata[plate_metadata$plate.barcode %in% plate.barcodes, ]
dim(plate_metadata)
[1] 247 6
dim(meta.data)
[1] 36 6
corner(meta.data)
Use only cells from “Aorta”" plate barcodes.
aorta.plates = filter(tissue_plates, subtissue == "Aorta")
aorta.barcodes = plate.barcodes %in% aorta.plates$plate.barcode
sum(aorta.barcodes)
[1] 1113
raw.data = raw.data[aorta.barcodes]
Create per-cell metadata from plate barcode metadata
barcode.df = t.data.frame(as.data.frame(plate.barcodes[aorta.barcodes]))
head(barcode.df)
[,1]
X.MAA000594. "MAA000594"
X.MAA000594..1 "MAA000594"
X.MAA000594..2 "MAA000594"
X.MAA000594..3 "MAA000594"
X.MAA000594..4 "MAA000594"
X.MAA000594..5 "MAA000594"
MAA000594
MAA000594
MAA000594
MAA000594
MAA000594
MAA000594
rownames(barcode.df) = colnames(raw.data)
colnames(barcode.df) = c('plate.barcode')
head(barcode.df)
plate.barcode
A21.MAA000594.3_8_M.1.1 "MAA000594"
D1.MAA000594.3_8_M.1.1 "MAA000594"
F8.MAA000594.3_8_M.1.1 "MAA000594"
H11.MAA000594.3_8_M.1.1 "MAA000594"
N15.MAA000594.3_8_M.1.1 "MAA000594"
P3.MAA000594.3_8_M.1.1 "MAA000594"
MAA000594
MAA000594
MAA000594
MAA000594
MAA000594
MAA000594
rnames = row.names(barcode.df)
meta.data <- merge(barcode.df, plate_metadata, by='plate.barcode', sort = F)
row.names(meta.data) <- rnames
head(meta.data)
Process the raw data and load it into the Seurat object.
# Find ERCC's, compute the percent ERCC, and drop them from the raw data.
erccs <- grep(pattern = "^ERCC-", x = rownames(x = raw.data), value = TRUE)
percent.ercc <- Matrix::colSums(raw.data[erccs, ])/Matrix::colSums(raw.data)
ercc.index <- grep(pattern = "^ERCC-", x = rownames(x = raw.data), value = FALSE)
raw.data <- raw.data[-ercc.index,]
# Create the Seurat object with all the data
tiss <- CreateSeuratObject(raw.data = raw.data, project = tissue_of_interest,
min.cells = 5, min.genes = 5)
tiss <- AddMetaData(object = tiss, meta.data)
tiss <- AddMetaData(object = tiss, percent.ercc, col.name = "percent.ercc")
# Change default name for sums of counts from nUMI to nReads
colnames(tiss@meta.data)[colnames(tiss@meta.data) == 'nUMI'] <- 'nReads'
# Create metadata columns for annotations and subannotations
tiss@meta.data[,'annotation'] <- NA
tiss@meta.data[,'subannotation'] <- NA
Calculate percent ribosomal genes.
ribo.genes <- grep(pattern = "^Rp[sl][[:digit:]]", x = rownames(x = tiss@data), value = TRUE)
percent.ribo <- Matrix::colSums(tiss@raw.data[ribo.genes, ])/Matrix::colSums(tiss@raw.data)
tiss <- AddMetaData(object = tiss, metadata = percent.ribo, col.name = "percent.ribo")
A sanity check: genes per cell vs reads per cell.
GenePlot(object = tiss, gene1 = "nReads", gene2 = "nGene", use.raw=T)
Filter out cells with few reads and few genes.
tiss <- FilterCells(object = tiss, subset.names = c("nGene", "nReads"),
low.thresholds = c(500, 50000), high.thresholds = c(25000, 2000000))
Normalize the data, then regress out correlation with total reads
tiss <- NormalizeData(object = tiss)
tiss <- ScaleData(object = tiss, vars.to.regress = c("nReads", "percent.ribo","Rn45s"))
[1] "Regressing out nReads" "Regressing out percent.ribo"
[3] "Regressing out Rn45s"
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 6%
|
|==== | 7%
|
|===== | 7%
|
|===== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 10%
|
|======= | 11%
|
|======== | 12%
|
|======== | 13%
|
|========= | 13%
|
|========= | 14%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|=========== | 18%
|
|============ | 18%
|
|============ | 19%
|
|============= | 20%
|
|============= | 21%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 26%
|
|================= | 27%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 29%
|
|=================== | 30%
|
|==================== | 30%
|
|==================== | 31%
|
|===================== | 32%
|
|===================== | 33%
|
|====================== | 33%
|
|====================== | 34%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 39%
|
|========================== | 40%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 42%
|
|============================ | 43%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 45%
|
|============================== | 46%
|
|=============================== | 47%
|
|=============================== | 48%
|
|================================ | 48%
|
|================================ | 49%
|
|================================ | 50%
|
|================================= | 51%
|
|================================= | 52%
|
|================================== | 52%
|
|================================== | 53%
|
|=================================== | 54%
|
|=================================== | 55%
|
|==================================== | 55%
|
|==================================== | 56%
|
|===================================== | 57%
|
|===================================== | 58%
|
|====================================== | 58%
|
|====================================== | 59%
|
|======================================= | 60%
|
|======================================= | 61%
|
|======================================== | 61%
|
|======================================== | 62%
|
|========================================= | 63%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 66%
|
|=========================================== | 67%
|
|============================================ | 67%
|
|============================================ | 68%
|
|============================================= | 69%
|
|============================================= | 70%
|
|============================================== | 70%
|
|============================================== | 71%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 73%
|
|================================================ | 74%
|
|================================================= | 75%
|
|================================================= | 76%
|
|================================================== | 76%
|
|================================================== | 77%
|
|=================================================== | 78%
|
|=================================================== | 79%
|
|==================================================== | 79%
|
|==================================================== | 80%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|====================================================== | 82%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 86%
|
|======================================================== | 87%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|========================================================== | 89%
|
|========================================================== | 90%
|
|=========================================================== | 90%
|
|=========================================================== | 91%
|
|============================================================ | 92%
|
|============================================================ | 93%
|
|============================================================= | 93%
|
|============================================================= | 94%
|
|============================================================== | 95%
|
|============================================================== | 96%
|
|=============================================================== | 96%
|
|=============================================================== | 97%
|
|================================================================ | 98%
|
|================================================================ | 99%
|
|=================================================================| 99%
|
|=================================================================| 100%
[1] "Scaling data matrix"
|
| | 0%
|
|=================================================================| 100%
tiss <- FindVariableGenes(object = tiss, do.plot = TRUE, x.high.cutoff = Inf, y.cutoff = 0.5)
Run Principal Component Analysis.
tiss <- RunPCA(object = tiss, do.print = FALSE)
tiss <- ProjectPCA(object = tiss, do.print = FALSE)
Later on (in FindClusters and TSNE) you will pick a number of principal components to use. This has the effect of keeping the major directions of variation in the data and, ideally, supressing noise. There is no correct answer to the number to use, but a decent rule of thumb is to go until the plot plateaus.
PCElbowPlot(object = tiss)
Choose the number of principal components to use.
# Set number of principal components.
n.pcs = 10
The clustering is performed based on a nearest neighbors graph. Cells that have similar expression will be joined together. The Louvain algorithm looks for groups of cells with high modularity–more connections within the group than between groups. The resolution parameter determines the scale…higher resolution will give more clusters, lower resolution will give fewer.
For the top-level clustering, aim to under-cluster instead of over-cluster. It will be easy to subset groups and further analyze them below.
# Set resolution
res.used <- 0.5
tiss <- FindClusters(object = tiss, reduction.type = "pca", dims.use = 1:n.pcs,
resolution = res.used, print.output = 0, save.SNN = TRUE)
To visualize
# If cells are too spread out, you can raise the perplexity. If you have few cells, try a lower perplexity (but never less than 10).
tiss <- RunTSNE(object = tiss, dims.use = 1:n.pcs, seed.use = 10, perplexity=30)
# note that you can set do.label=T to help label individual clusters
TSNEPlot(object = tiss, do.label = T)
Check expression of genes of interset.
Dotplots let you see the intensity of exppression and the fraction of cells expressing for each of your genes of interest.
How big are the clusters?
table(tiss@ident)
0 1 2 3 4 5
127 84 54 47 35 17
Which markers identify a specific cluster?
clust.markers <- FindMarkers(object = tiss, ident.1 = 0, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
print(x = head(x= clust.markers, n = 10))
p_val avg_diff pct.1 pct.2
Sec61g 1.152276e-40 0.7004142 0.583 0.633
Krt5 1.674866e-30 0.7094954 0.315 0.160
Rn45s 6.425642e-29 0.4682059 1.000 1.000
Beta-s 2.380836e-28 5.8723003 0.465 0.110
Atp5k 8.216369e-26 0.2670373 0.323 0.574
Fscn1 1.178890e-25 0.7156130 0.142 0.435
Hba-a1 3.636674e-25 3.9809616 0.425 0.038
Mkrn1 2.849556e-24 2.4082606 0.496 0.304
Ubb 9.499163e-24 0.3871989 0.819 0.987
Rps8 2.521841e-21 0.2659899 0.528 0.882
You can also compute all markers for all clusters at once. This may take some time.
tiss.markers <- FindAllMarkers(object = tiss, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
Display the top markers you computed above.
tiss.markers %>% group_by(cluster) %>% top_n(5, avg_diff)
At a coarse level, we can use canonical markers to match the unbiased clustering to known cell types:
0: heterogenous group of cells 1: endothelial cells 2: adipocytes 3: endothelial cells 4: fibroblasts 5: hematopoetic
# stash current cluster IDs
tiss <- StashIdent(object = tiss, save.name = "cluster.ids")
# enumerate current cluster IDs and the labels for them
cluster.ids <- c(0, 1, 2, 3, 4, 5)
annotation <- c("heterogenous group of cells", "endothelial cell", "epicardial adipocytes", "endothelial cells", "fibroblasts", "hematopoetic cells")
cell_ontology_id = c(NA, "CL:0000115", "CL:1000309", "CL:0000115", "CL:0000057", "CL:0000988")
tiss@meta.data[,'annotation'] <- plyr::mapvalues(x = tiss@meta.data$cluster.ids, from = cluster.ids, to = annotation)
tiss@meta.data[,'cell_ontology_id'] <- plyr::mapvalues(x = tiss@meta.data$cluster.ids, from = cluster.ids, to = cell_ontology_id)
TSNEPlot(object = tiss, do.label = TRUE, pt.size = 0.5, group.by='annotation')
head(tiss@meta.data)
Color by metadata, like plate barcode, to check for batch effects.
TSNEPlot(object = tiss, do.return = TRUE, group.by = "plate.barcode")
Print a table showing the count of cells in each identity category from each plate.
table(as.character(tiss@ident), as.character(tiss@meta.data$plate.barcode))
B002430 B002431 MAA000594 MAA000595 MAA000906 MAA000908
0 5 16 65 25 12 4
1 0 0 1 2 3 78
2 12 38 1 1 2 0
3 2 1 0 0 33 11
4 0 0 17 17 1 0
5 0 0 2 15 0 0
We can repeat the above analysis on a subset of cells, defined using cluster IDs or some other metadata. This is a good way to drill down and find substructure.
# Subset data based on cluster id
subtiss <- SubsetData(object = tiss, ident.use = c(0), do.center = F, do.scale = F, cells.use = )
# To subset data based on annotation or other metadata, you can explicitly pass cell names
# anno = 'thymocyte'
# cells.to.use = tiss@cell.names[which(tiss@meta.data$annotation == anno)]
# subtiss <- SubsetData(object = tiss, cells.use = cells.to.use, do.center = F, do.scale = F)
subtiss <- NormalizeData(object = subtiss)
subtiss <- ScaleData(object = subtiss, vars.to.regress = c("nReads", "percent.ribo","Rn45s"))
[1] "Regressing out nReads" "Regressing out percent.ribo"
[3] "Regressing out Rn45s"
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 6%
|
|==== | 7%
|
|===== | 7%
|
|===== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 10%
|
|======= | 11%
|
|======== | 12%
|
|======== | 13%
|
|========= | 13%
|
|========= | 14%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|=========== | 18%
|
|============ | 18%
|
|============ | 19%
|
|============= | 20%
|
|============= | 21%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 26%
|
|================= | 27%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 29%
|
|=================== | 30%
|
|==================== | 30%
|
|==================== | 31%
|
|===================== | 32%
|
|===================== | 33%
|
|====================== | 33%
|
|====================== | 34%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 39%
|
|========================== | 40%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 42%
|
|============================ | 43%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 45%
|
|============================== | 46%
|
|=============================== | 47%
|
|=============================== | 48%
|
|================================ | 48%
|
|================================ | 49%
|
|================================ | 50%
|
|================================= | 51%
|
|================================= | 52%
|
|================================== | 52%
|
|================================== | 53%
|
|=================================== | 54%
|
|=================================== | 55%
|
|==================================== | 55%
|
|==================================== | 56%
|
|===================================== | 57%
|
|===================================== | 58%
|
|====================================== | 58%
|
|====================================== | 59%
|
|======================================= | 60%
|
|======================================= | 61%
|
|======================================== | 61%
|
|======================================== | 62%
|
|========================================= | 63%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 66%
|
|=========================================== | 67%
|
|============================================ | 67%
|
|============================================ | 68%
|
|============================================= | 69%
|
|============================================= | 70%
|
|============================================== | 70%
|
|============================================== | 71%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 73%
|
|================================================ | 74%
|
|================================================= | 75%
|
|================================================= | 76%
|
|================================================== | 76%
|
|================================================== | 77%
|
|=================================================== | 78%
|
|=================================================== | 79%
|
|==================================================== | 79%
|
|==================================================== | 80%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|====================================================== | 82%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 86%
|
|======================================================== | 87%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|========================================================== | 89%
|
|========================================================== | 90%
|
|=========================================================== | 90%
|
|=========================================================== | 91%
|
|============================================================ | 92%
|
|============================================================ | 93%
|
|============================================================= | 93%
|
|============================================================= | 94%
|
|============================================================== | 95%
|
|============================================================== | 96%
|
|=============================================================== | 96%
|
|=============================================================== | 97%
|
|================================================================ | 98%
|
|================================================================ | 99%
|
|=================================================================| 99%
|
|=================================================================| 100%
[1] "Scaling data matrix"
|
| | 0%
|
|=================================================================| 100%
subtiss <- FindVariableGenes(object = subtiss, do.plot = TRUE, x.high.cutoff = Inf, y.cutoff = 0.8)
subtiss <- RunPCA(object = subtiss, pcs.compute = 20)
[1] "PC1"
[1] "Dll3" "Gja4" "Prph2" "Il4"
[5] "Hspb9" "Grin2d" "Fam55d" "Gcgr"
[9] "Ankdd1b" "Scn4a" "Ptch2" "Kcnc1"
[13] "Gltscr1" "Park2" "Ahrr" "Intu"
[17] "Ybey" "Cyp46a1" "Gulo" "6330410L21Rik"
[21] "Tmem181b-ps" "Pla2g2c" "Sectm1b" "Mcpt8"
[25] "Hcn3" "C330046G13Rik" "Scgb2b1-ps" "Gm6623"
[29] "Scml4" "Vmn2r117"
[1] ""
[1] "Dvl3" "Dusp23" "Fam114a1" "Srpk1"
[5] "Slmo2" "Emd" "Vmn2r57" "Sema3a"
[9] "Bcl10" "Ccdc42" "Brd7" "U2af1l4"
[13] "Efna1" "Atp6v1h" "Dapk2" "Grid2ip"
[17] "Polr1a" "Mrpl33" "1810011O10Rik" "Cxcl9"
[21] "Cd200" "Mms22l" "4921513D23Rik" "Trp53rk"
[25] "Efcab4a" "Ctnnbip1" "Car4" "Wipi1"
[29] "Erg" "Efnb1"
[1] ""
[1] ""
[1] "PC2"
[1] "Slc26a9" "March3" "Cramp1l" "Pabpc1l"
[5] "Fbxo22" "Zfp866" "Col24a1" "9930111J21Rik1"
[9] "Esr2" "Polq" "Trim72" "Slc2a9"
[13] "Styk1" "Airn" "Kif14" "Slc24a1"
[17] "Slc11a1" "Ccdc122" "Gm101" "Fbxw14"
[21] "Degs2" "Tcte2" "Atg9b" "Gm6086"
[25] "Rnf207" "Rufy2" "Fmo9" "Il1f8"
[29] "Cyp21a1" "Actl6b"
[1] ""
[1] "BC030307" "Tmem63c" "Gpr17" "Tmprss13"
[5] "Slc2a4rg-ps" "Sfmbt1" "Atp8a2" "Olfr1329"
[9] "Irx5" "Hecw2" "Caml" "Clgn"
[13] "Kbtbd5" "Cftr" "Gm2115" "Ces1c"
[17] "Zfp780b" "Olfr1419" "Zfp599" "5430425J12Rik"
[21] "Sbf1" "Gm6936" "Lancl3" "Gm805"
[25] "Pax1" "Slc22a4" "Olfr6" "Myf6"
[29] "Pde6g" "Myh8"
[1] ""
[1] ""
[1] "PC3"
[1] "Dvl3" "Dusp23" "Srpk1" "Efna1"
[5] "Nubpl" "Dapk2" "Senp7" "1810011O10Rik"
[9] "Efnb1" "6230427J02Rik" "Efcab4a" "Bcl10"
[13] "Mrpl33" "Klf7" "Car4" "Ecscr"
[17] "Lyar" "Trp53rk" "Gm9320" "Hist2h2ac"
[21] "Erg" "Ctnnbip1" "U2af1l4" "Sdpr"
[25] "Lrp6" "Dnalc4" "Exosc10" "Srrd"
[29] "2700049A03Rik" "Lifr"
[1] ""
[1] "Bbs1" "Drosha" "Lrp2bp" "Sbf1"
[5] "Caml" "Zfp780b" "Wbscr17" "Lrig3"
[9] "Susd1" "Mamstr" "Th" "Gm2115"
[13] "Myh8" "Serpini1" "Tmprss13" "Kbtbd5"
[17] "2810011L19Rik" "Zfp599" "Gm6936" "Igll1"
[21] "Pde6g" "Olfr1329" "Slc22a4" "Dab2"
[25] "Olfr1419" "Ncf2" "Armc4" "C87414"
[29] "Irx5" "Pax1"
[1] ""
[1] ""
[1] "PC4"
[1] "Wdr59" "Ddx60" "Wdr70" "Wdr64"
[5] "Ppfibp1" "Cst13" "Fam83g" "Gpr113"
[9] "Zfhx2" "Rgag1" "6030429G01Rik" "Dusp13"
[13] "Sh3bp2" "A230057D06Rik" "Trp53i13" "Ndufs5"
[17] "Anxa10" "Eif2c3" "Olfr1013" "Krt76"
[21] "Hba-x" "Gm12794" "Ankrd40" "Ushbp1"
[25] "Tmod4" "2310007B03Rik" "Trim56" "Tfap2d"
[29] "Gm5122" "Gm628"
[1] ""
[1] "Adarb1" "Ppp3r2" "C330021F23Rik" "Mmachc"
[5] "Plekhh1" "Rgs9" "Dio1" "Hspb2"
[9] "Arhgef9" "Nif3l1" "Unc80" "Zfp623"
[13] "Ccdc147" "Adprhl1" "Ppan" "Htr2c"
[17] "Tdpoz1" "Klf17" "Mybphl" "Zkscan16"
[21] "Gm10516" "Lama1" "Wdr69" "Btbd16"
[25] "Lhx1" "Mmrn2" "Grk5" "B4galnt3"
[29] "1110001J03Rik" "Rec8"
[1] ""
[1] ""
[1] "PC5"
[1] "Mtss1l" "Zeb2" "Rab3a" "Mtap1b"
[5] "Sez6l" "Trp53i13" "Ankrd40" "Scube2"
[9] "Fam83g" "Gm12794" "Yeats2" "Ndufs5"
[13] "Rfwd3" "Trim56" "Dnajc27" "Nr4a2"
[17] "Sh3bp2" "Wdr59" "Mtap9" "Eif2c3"
[21] "Slc36a3" "Gm5122" "Tmod4" "4930543E12Rik"
[25] "Gm628" "Tyms-ps" "Cenpp" "Gm10510"
[29] "Tprg" "2310007B03Rik"
[1] ""
[1] "Scnm1" "Pygo2" "Lyar" "Trp53rk"
[5] "Dapk2" "2810025M15Rik" "1810011O10Rik" "Efna1"
[9] "1110019D14Rik" "Prkch" "Mettl11a" "Efcab4a"
[13] "Nubpl" "Ctnnbip1" "Car4" "Dcaf5"
[17] "Kpna3" "Erg" "Brd7" "Gm9320"
[21] "Nxt2" "Hsdl2" "Srrd" "2700049A03Rik"
[25] "Eng" "Efnb1" "Hist2h2ac" "Tomm22"
[29] "Senp7" "U2af1l4"
[1] ""
[1] ""
subtiss <- ProjectPCA(object = subtiss, do.print = FALSE)
# If this fails for your subset, it may be that cells.use is more cells than you have left! Try reducing it.
PCHeatmap(object = subtiss, pc.use = 1:3, cells.use = 100, do.balanced = TRUE, label.columns = FALSE, num.genes = 12)
Later on (in FindClusters and TSNE) you will pick a number of principal components to use. This has the effect of keeping the major directions of variation in the data and, ideally, supressing noise. There is no correct answer to the number to use, but a decent rule of thumb is to go until the plot plateaus.
PCElbowPlot(object = subtiss)
Choose the number of principal components to use.
# Set number of principal components.
sub.n.pcs = 5
The clustering is performed based on a nearest neighbors graph. Cells that have similar expression will be joined together. The Louvain algorithm looks for groups of cells with high modularity–more connections within the group than between groups. The resolution parameter determines the scale…higher resolution will give more clusters, lower resolution will give fewer.
# Set resolution
sub.res.used <- 1
subtiss <- FindClusters(object = subtiss, reduction.type = "pca", dims.use = 1:sub.n.pcs,
resolution = sub.res.used, print.output = 0, save.SNN = TRUE, force.recalc = TRUE)
To visualize
# If cells are too spread out, you can raise the perplexity. If you have few cells, try a lower perplexity (but never less than 10).
subtiss <- RunTSNE(object = subtiss, dims.use = 1:sub.n.pcs, seed.use = 10, perplexity=20)
# note that you can set do.label=T to help label individual clusters
TSNEPlot(object = subtiss, do.label = T)
subtiss.markers <- FindAllMarkers(object = subtiss, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
subtiss.markers %>% group_by(cluster) %>% top_n(6, avg_diff)
Check expression of genes of interset.
genes_to_check = (subtiss.markers %>% group_by(cluster) %>% top_n(6, avg_diff))$gene
# genes_to_check = c('Vim','Krt5','Krt8','Ptprc','Epcam','H2-D1','H2-Aa','H2-Ab1','Cd34','Itgam','Syk','Col1a2','Timp2','Cd74','Laptm5','Selplg','Fabp4','Icam1','Vcam1','Adipor1','Alas2','Cnn1','Acta2','Dcn','Myocd','Mmp2','Lmna','Eln','Col3a1','Col4a1','Col1a1','Pdgfra','Gypa','Col6a3','Pecam1','Pdgfrb','Psmb8','Myh10','Tpm4','Cald1','Rbp1', 'Cd36')
FeaturePlot(subtiss, genes_to_check, pt.size = 1)
Dotplots let you see the intensity of exppression and the fraction of cells expressing for each of your genes of interest.
How big are the clusters?
table(subtiss@ident)
0 1 2
51 39 37
Color by metadata, like plate barcode, to check for batch effects.
TSNEPlot(object = subtiss, do.return = TRUE, group.by = "plate.barcode")
Print a table showing the count of cells in each identity category from each plate.
table(as.character(subtiss@ident), as.character(subtiss@meta.data$plate.barcode))
B002430 B002431 MAA000594 MAA000595 MAA000906 MAA000908
0 5 10 19 4 11 2
1 0 6 23 7 1 2
2 0 0 23 14 0 0
For the subsets, we produce subannotations. These will be written back as metadata in the original object, so we can see all subannotations together.
If some of the clusters you find in the subset deserve additional annotation, you can add that right here. Use NA for clusters for which no subannotation is needed.
subcluster.ids <- c(0, 1,2)
subannotation <- c(NA, "epicardial adipocyte", "smooth muscle cell")
sub_cell_ontology_id = c(NA, "CL:1000309", "CL:0000192")
subtiss@meta.data[,'annotation'] <- plyr::mapvalues(x = subtiss@ident, from = subcluster.ids, to = subannotation)
subtiss@meta.data[,'cell_ontology_id'] <- plyr::mapvalues(x = subtiss@ident, from = subcluster.ids, to = sub_cell_ontology_id)
tiss@meta.data[subtiss@cell.names,'annotation'] <- as.character(subtiss@meta.data$annotation)
tiss@meta.data[subtiss@cell.names,'cell_ontology_id'] <- as.character(subtiss@meta.data$cell_ontology_id)
TSNEPlot(object = tiss, do.label = TRUE, pt.size = 0.5, group.by='annotation')
Warning: Removed 1 rows containing missing values (geom_text).
Save the tissue object with updated annotations
filename = here('00_data_ingest', '04_tissue_robj_generated',
paste0(tissue_of_interest, "_seurat_tiss.Robj"))
print(filename)
[1] "/Users/olgabot/code/tabula-muris/00_data_ingest/04_tissue_robj_generated/Aorta_seurat_tiss.Robj"
save(tiss, file=filename)
So that Biohub can easily combine all your annotations, please export them as a simple csv.
filename = here('00_data_ingest', '03_tissue_annotation_csv',
paste0(tissue_of_interest, "_annotation.csv"))
write.csv(tiss@meta.data[,c('plate.barcode','annotation','cell_ontology_id')], file=filename)